A <- 1:5
B <- 11:15
names(A) <- A
names(B) <- B
A
B
View(anscombe)
lm(y3~x3, data = anscombe)
##-- now some "magic" to do the 4 regressions in a loop:
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
## or ff[[2]] <- as.name(paste0("y", i))
## ff[[3]] <- as.name(paste0("x", i))
mods[[i]] <- lmi <- lm(ff, data = anscombe)
print(anova(lmi))
}
## See how close they are (numerically!)
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))
## Now, do what you should have done in the first place: PLOTS
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma = c(0, 0, 2, 0))
for(i in 1:4) {
ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
xlim = c(3, 19), ylim = c(3, 13))
abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
x = c(TRUE, FALSE, FALSE)
typeof(x) #logical (vector)
mode(x)
storage.mode(x)
y = 1:10
typeof(y)
mode(y)
storage.mode(y)
dim(y)
length(y)
z= list(1, TRUE, 'safs') #trying to get a list
typeof(z)
class(z)
z[3]
length(z)
dim(z)
quote(x+y)
as.list(quote(x + y))
1e3L #create constant
1L
cat(1+2)
'+'(1, 2)
x <- options()
x$prompt
match(NA, NaN)
match(NA, NA)
match(NaN, NaN)
x = array(1:8, c(2,4))
x
i=2
j=3
x[i]
x[i, j]
x[[i]]
x[[i, j]]
rownames(x)=c("a","b")
x
x = as.data.frame(x)
x
x["a",]
x[]
i <- matrix(1:4, 2, byrow = TRUE)
i
i[2,]
i[2, ,drop=FALSE] # keeping dimension 1 * n when selection a row
dim(i[2,])
dim(i[2, ,drop=FALSE])
# https://cran.r-project.org/doc/manuals/R-intro.pdf
help.start()
x <- rnorm(10000)
y <- rnorm(x)
plot(x, y)
hist(y)
ls()
rm(list=ls())
ls()
x <- 1:20
w <- 1 + sqrt(x)/2
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
dummy
# 4 Ordered and unordered factors
state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
"sa", "act", "nsw", "vic", "vic", "act")
statef <- factor(state)
statef
levels(statef)
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
59, 46, 58, 43)
incmeans <- tapply(incomes, statef, mean)
incmeans
# Arrays
a <- array(1:30, dim=c(2, 5,3))
a
x <- array(1:20, dim=c(4,5)) # Generate a 4 by 5 array.
x
i <- array(c(1:3,3:1), dim=c(3,2))
i
x[i] #get the 3 elements shown by i: (3, 1), (2, 2) and (1, 3)
help("<-")
# Back to http://zoonek2.free.fr/UNIX/48_R/02.html
x <- rnorm(10)
x
sort(x)
order(x)
x[order(x)]
x <- sample(1:5, 10, replace=T)
x
x[order(x)]
unique(x)
seq(0,10, length=11) == seq(0,10, by=1)
# Rep
rep(0, 10)
rep(1:5,3)
rep(1:5, each=3)
rep(1:5,2,each=3)
# Factor
x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) )
x
# specify levels
l <- c("Yes", "No", "Perhaps")
x <- factor( sample(l, 5, replace=T), levels=l )
x
table(x)
# gl: Generate Factor Levels
gl(1,4)
gl(2,4)
gl(2,1,8)
gl(2,1,8, labels=c(T,F))
x <- gl(2,4)
x
y <- gl(2,1,8)
y
interaction(x,y)
data.frame(x,y, int=interaction(x,y))
# Cartesian product (toutes les possibilites)
x <- c("A", "B", "C")
y <- 1:2
z <- c("a", "b")
expand.grid(x,y,z)
x <- factor(c(3,4,5,1))
as.numeric( levels(x))
as.numeric( levels(x)[ x ] ) # proper way to convert from factor to numeric
# Data Frames
n <- 10
df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) )
str(df)
summary(df)
names(df);cat(rownames(df))
# Merge
merge(x, y) # INNER JOIN
merge(x, y, all.x = TRUE) # LEFT JOIN
merge(x, y, all.y = TRUE) # RIGHT JOIN
merge(x, y, all = TRUE) # OUTER JOIN
merge(a, b, by=c("y", "z")) # specify what to merge on
Error in fix.by(by.x, x) : 'by' must specify uniquely valid columns
# Regression
data(cars)
#View(cars)
# Regression
lm.fit=lm( dist ~ speed, data=cars, na.action = na.exclude)
lm.fit
# Polynomial regression
lm.fit3 = lm( dist ~ poly(speed,3), data=cars)
#plot(lm.fit)
plot(cars$speed, cars$dist)
abline(lm.fit)
# Lists
h <- list()
h[["foo"]] <- 1
h[["bar"]] <- c("a", "b", "c")
h["bar"] == h[["bar"]] #h["bar"] is a list containing the vector
# Delete
h[["bar"]] <- NULL
m <- matrix( c(1,2,3,4), nrow=2 )
m
solve(m)
x <- matrix( c(6,7), nrow=2 )
solve(m, x)
?solve
n <- 1000
x1 <- factor( sample(1:3, n, replace=T), levels=1:3 )
x2 <- factor( sample(LETTERS[1:5], n, replace=T), levels=LETTERS[1:5] )
x3 <- factor( sample(c(F,T),n,replace=T), levels=c(F,T) )
d <- data.frame(x1,x2,x3)
r <- table(d)
r
ftable(d) #easier reading
# contingency table into a data.frame
n <- 100
k <- 10
x <- factor( sample(LETTERS[1:k], n, replace=T), levels=LETTERS[1:k] )
x
d <- table(x)
x2 = factor( rep(names(d),d), levels=names(d) )
x2
# apply
options(digits=4)
df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20))
apply(df,2,mean)
rownames(df) <- LETTERS[1:20]
apply(df, 1, mean)
gl(2,10,20)
tapply(1:20, gl(2,10,20), sum) # tapply: 2nd argument used for grouping
by(1:20, gl(2,10,20), sum)
x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1))
sapply(x,sd) # sapply: apply FUN on each element of vector
lapply(x,sd) # lapply: same but returns list
# Exercise: Let x be a boolean vector. Count the number of sequences ("runs") of zeros (for instance, in 00101001010110, there are 6 runs: 00 0 00 0 0 0). Count the number of sequences of 1. Counth the total number of sequences. Same question for a factor with more tham two levels.
n <- 50
x <- sample(0:1, n, replace=T, p=c(.2,.8))
x
diff(x, lag=1)
#Let r be the return of a financial asset. The clustered return is the accumulated return for a sequence of returns of the same sign. The trend number is the number of steps in such a sequence. The average return is their ratio. Compute these quantities.
data(EuStockMarkets)
x <- EuStockMarkets
# We aren't interested in the spot prices, but in the returns
# return[i] = ( price[i] - price[i-1] ) / price[i-1]
y <- apply(x, 2, function (x) { diff(x)/x[-length(x)] })
# We normalize the data
z <- apply(y, 2, function (x) { (x-mean(x))/sd(x) })
# A single time series
r <- z[,1]
# The runs
f <- factor(cumsum(abs(diff(sign(r))))/2)
r <- r[-1]
accumulated.return <- tapply(r, f, sum)
trend.number <- table(f)
boxplot(abs(accumulated.return) ~ trend.number, col='pink',
main="Accumulated return")
# Strings
print("Hello\n")
cat("Hello\n") #use cat
paste("Hello", "World", "!", sep="") #concatenate
paste("Hello ", " World", "!", sep="")
x <- 5
paste("x=", x)
cat("x=", x, "\n", sep="\n")
s <- c("Hello", " ", "World", "!")
paste(s)
paste(s, sep="")
paste(s, collapse="")
paste(1:3, "Hello World!", sep=":")
nchar("Hello World!")
s <- "Hello World"
substring(s, 4, 6)
s <- "foo-->bar-->baz"
strsplit(s, "-->")
# Regex
s <- "foo, bar, baz"
strsplit(s, ", *")
s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")
s
grep("O", s)
grep("O", s, value=T)
regexpr("o", "Hello")
regexpr("o", c("Hello", "World!"))
s <- "foo bar baz"
gsub(" ", "", s) # Remove all the spaces
gsub(" +", " ", s) # Remove multiple spaces and replace them by single spaces
#The "sub" is similar to "gsub" but only replaces the first occurrence.
s <- "foo bar baz"
sub(" ", "", s)
# Dates
as.Date("2005-05-15") #ISO 8601
as.Date("15/05/2005", format="%d/%m/%Y")
as.Date("15/05/05", format="%d/%m/%y")
cat("\n")
as.Date("01/02/03", format="%y/%m/%d")
as.Date("01/02/03", format="%y/%d/%m")
as.Date("01/02/03", format="%y/%m/%d") - as.Date("01/02/03", format="%y/%d/%m")
Sys.Date()
format(Sys.Date(), format="%A, %d %B %Y")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by="month")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by=31)
methods(class="Date")
as.POSIXlt("2005-05-15 21:45:17")
as.POSIXct(Sys.Date())
# Reading from dataframes
# option 1
#d <- read.table("foo.txt")
#d$Date <- as.Date( as.character( d$Date ) )
# option 2
#read.table("foo.txt", colClasses=c("Date", "character", rep(10, "numeric")))
options(warn=1)
methods(plot)
# Import
# d <- read.table("foo.txt", header=T, sep=",")
# d <- read.csv("txt.csv")
# d <- read.csv2("txt.csv") # semicolon-separated file, with a
# # comma instead of the decimal point.
# d <- read.delim("foo.txt") # Tab-delimited file
# d <- read.fwf("txt.fwf") # Fixed width fields
# Excel: this may be trickier: the missing values often appear as "#N/A!" and are mistaken for the start of a comment... You can try
# d <- read.table("foo.csv", header = TRUE, sep = ",",
# na.strings = c("#N/A!", "NA", "@NA"),
# quote = '"',
# comment.char = "")
#If your file only contains number, or only strings, it is wiser to store it in a matrix, not a data.frame. This is what the "scan" function does.
# A numeric matrix
# x <- scan("foo.txt", sep=",") # Gives a numeric vector
# n <- scan("foo.txt", sep=",", nlines=1)
# x <- matrix(x, nc=n)
# A vector of strings
#x <- scan("foo.txt", what=character(0))
# Back tohttps://cran.r-project.org/doc/manuals/R-intro.pdf - Regression
# fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
# fm6 <- update(fm05, . ~ . + x6)
# smf6 <- update(fm6, sqrt(.) ~ .)
# Spine (from help)
require(graphics)
op <- par(mfrow = c(2,1), mgp = c(2,.8,0), mar = 0.1+c(3,3,3,1))
n <- 9
x <- 1:n
y <- rnorm(n)
plot(x, y, main = paste("spline[fun](.) through", n, "points"))
lines(spline(x, y))
lines(spline(x, y, n = 210), col = 2)
# NA handling - http://thomasleeper.com/Rcourse/Tutorials/NA.html
g1 <- c(1, 2, NA, NA, NA, 6, 7)
g2 <- na.omit(g1)
g2
attributes(g2)$na.action
sum(g1)
sum(g1, na.rm = TRUE)
# Cor -> can eliminate only pair-wise NAs ()
x <- c(1, 2, 3, NA, 5, 7, 9)
y <- c(3, 2, 4, 5, 1, 3, 4)
z <- c(NA, 2, 3, 5, 4, 3, 4)
m <- data.frame(x, y, z)
m
cor(m) # returns all NAs
x y z
x 1 NA NA
y NA 1 NA
z NA NA 1
cor(m, use = "complete.obs")
x y z
x 1.0000 0.34819 0.70957
y 0.3482 1.00000 0.04583
z 0.7096 0.04583 1.00000
cor(m, use = "pairwise.complete.obs")
x y z
x 1.0000 0.2498 0.7096
y 0.2498 1.0000 0.4534
z 0.7096 0.4534 1.0000
# Defaut for lm is also casewise deletion
lm <- lm(y ~ x + z, data = m)
summary(lm)
Call:
lm(formula = y ~ x + z, data = m)
Residuals:
2 3 5 6 7
-0.632 1.711 -1.237 -0.447 0.605
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.316 3.399 0.98 0.43
x 0.289 0.408 0.71 0.55
z -0.632 1.396 -0.45 0.70
Residual standard error: 1.65 on 2 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.203, Adjusted R-squared: -0.594
F-statistic: 0.254 on 2 and 2 DF, p-value: 0.797
# Checking length of data used for regression
length(m$y)
[1] 7
length(lm$fitted)
[1] 5
# checking where missing data is
is.na(m) # only useful for small examples
x y z
[1,] FALSE FALSE TRUE
[2,] FALSE FALSE FALSE
[3,] FALSE FALSE FALSE
[4,] TRUE FALSE FALSE
[5,] FALSE FALSE FALSE
[6,] FALSE FALSE FALSE
[7,] FALSE FALSE FALSE
# image
image(is.na(m), main = "Missing Values", xlab = "Observation", ylab = "Variable",
xaxt = "n", yaxt = "n", bty = "n")
axis(1, seq(0, 1, length.out = nrow(m)), 1:nrow(m), col = "white")
axis(2, c(0, 0.5, 1), names(m), col = "white", las = 2)

# to remove casewise:
m2 <- na.omit(m) # use new variable to keep original dataframe
m
m2
# Mean and Random imputation
x2 <- x
x2[is.na(x2)] <- mean(x2, na.rm = TRUE)
x
[1] 1 2 3 NA 5 7 9
x2
[1] 1.0 2.0 3.0 4.5 5.0 7.0 9.0
# Random imputation - conserve mean and variance. Sample rest of the values to fill NAs
x3 <- x
x3[is.na(x3)] <- sample(x3[!is.na(x3)], sum(is.na(x3)), TRUE)
x
[1] 1 2 3 NA 5 7 9
x3
[1] 1 2 3 5 5 7 9
# Saving R data http://thomasleeper.com/Rcourse/Tutorials/savingdata.html
set.seed(1)
mydf <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
save(mydf, file = "saveddf.RData")
unlink("saveddf.RData")
# dput to have a readable format (e.g. for stack overflow)
dput(mydf)
structure(list(x = c(-0.626453810742332, 0.183643324222082, -0.835628612410047,
1.59528080213779, 0.329507771815361, -0.820468384118015, 0.487429052428485,
0.738324705129217, 0.575781351653492, -0.305388387156356), y = c(1.51178116845085,
0.389843236411431, -0.621240580541804, -2.2146998871775, 1.12493091814311,
-0.0449336090152309, -0.0161902630989461, 0.943836210685299,
0.821221195098089, 0.593901321217509), z = c(0.918977371608218,
0.782136300731067, 0.0745649833651906, -1.98935169586337, 0.61982574789471,
-0.0561287395290008, -0.155795506705329, -1.47075238389927, -0.47815005510862,
0.417941560199702)), .Names = c("x", "y", "z"), row.names = c(NA,
-10L), class = "data.frame")
dput(mydf, "saveddf.txt")
mydf2 <- dget("saveddf.txt")
mydf2
mydf==mydf2
x y z
[1,] FALSE FALSE FALSE
[2,] FALSE FALSE FALSE
[3,] FALSE FALSE TRUE
[4,] FALSE TRUE FALSE
[5,] FALSE FALSE FALSE
[6,] FALSE FALSE FALSE
[7,] FALSE FALSE FALSE
[8,] FALSE FALSE FALSE
[9,] FALSE FALSE FALSE
[10,] TRUE FALSE FALSE
unlink("saveddf.text")
# csv
write.csv(mydf, file = "saveddf.csv")
unlink("savedf.csv")
# Dataframe rearrangement
set.seed(50)
mydf <- data.frame(a = rep(1:2, each = 10), b = rep(1:4, times = 5), c = rnorm(20),
d = rnorm(20), e = sample(1:20, 20, FALSE))
head(mydf)
# manual order change
head(mydf[, c("c", "d", "e", "a", "b")])
# mydf <- mydf[, c(3, 4, 5, 1, 2)]
# using order
order(mydf$e)
[1] 2 16 18 19 20 14 3 4 5 12 1 11 13 15 7 8 10 9 6 17
head(mydf[order(mydf$e), ])
# Subset
mydf[mydf$a == 1, ]
mydf[mydf$a == 1 & mydf$b > 2, ]
subset(mydf, a == 1 & b > 2)
subset(mydf, select = c("a", "b"))
# Splitting
split(mydf, mydf$a)
$`1`
$`2`
NA
split(mydf, list(mydf$a, mydf$b))
$`1.1`
$`2.1`
$`1.2`
$`2.2`
$`1.3`
$`2.3`
$`1.4`
$`2.4`
lapply(split(mydf, mydf$a), summary)
$`1`
a b c d e
Min. :1 Min. :1.00 Min. :-1.728 Min. :-1.590 Min. : 1.00
1st Qu.:1 1st Qu.:1.25 1st Qu.:-0.779 1st Qu.:-0.359 1st Qu.: 8.25
Median :1 Median :2.00 Median :-0.122 Median : 0.193 Median :13.00
Mean :1 Mean :2.30 Mean :-0.244 Mean : 0.299 Mean :12.10
3rd Qu.:1 3rd Qu.:3.00 3rd Qu.: 0.483 3rd Qu.: 0.568 3rd Qu.:16.75
Max. :1 Max. :4.00 Max. : 0.976 Max. : 2.668 Max. :19.00
$`2`
a b c d e
Min. :2 Min. :1.00 Min. :-1.166 Min. :-1.1304 Min. : 2.00
1st Qu.:2 1st Qu.:2.00 1st Qu.:-0.488 1st Qu.:-0.6338 1st Qu.: 4.25
Median :2 Median :3.00 Median :-0.343 Median : 0.2961 Median : 8.00
Mean :2 Mean :2.70 Mean :-0.268 Mean : 0.0793 Mean : 8.90
3rd Qu.:2 3rd Qu.:3.75 3rd Qu.: 0.108 3rd Qu.: 0.4137 3rd Qu.:12.75
Max. :2 Max. :4.00 Max. : 0.555 Max. : 1.8397 Max. :20.00
lapply(split(mydf, mydf$a), summary)
$`1`
a b c d e
Min. :1 Min. :1.00 Min. :-1.728 Min. :-1.590 Min. : 1.00
1st Qu.:1 1st Qu.:1.25 1st Qu.:-0.779 1st Qu.:-0.359 1st Qu.: 8.25
Median :1 Median :2.00 Median :-0.122 Median : 0.193 Median :13.00
Mean :1 Mean :2.30 Mean :-0.244 Mean : 0.299 Mean :12.10
3rd Qu.:1 3rd Qu.:3.00 3rd Qu.: 0.483 3rd Qu.: 0.568 3rd Qu.:16.75
Max. :1 Max. :4.00 Max. : 0.976 Max. : 2.668 Max. :19.00
$`2`
a b c d e
Min. :2 Min. :1.00 Min. :-1.166 Min. :-1.1304 Min. : 2.00
1st Qu.:2 1st Qu.:2.00 1st Qu.:-0.488 1st Qu.:-0.6338 1st Qu.: 4.25
Median :2 Median :3.00 Median :-0.343 Median : 0.2961 Median : 8.00
Mean :2 Mean :2.70 Mean :-0.268 Mean : 0.0793 Mean : 8.90
3rd Qu.:2 3rd Qu.:3.75 3rd Qu.: 0.108 3rd Qu.: 0.4137 3rd Qu.:12.75
Max. :2 Max. :4.00 Max. : 0.555 Max. : 1.8397 Max. :20.00
# sampling
s <- sample(1:nrow(mydf), 5, F) #no replacement
s
[1] 18 1 12 15 6
mydf[s,]
# test set
mydf[-s, ]
# Option 2
s2 <- rbinom(nrow(mydf), 1, 0.2)
s2
[1] 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
mydf[s2,]
mydf[-s2,]
library(car)
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
b <- 1:20
#h <- recode(b, "1:5=1: 6:10=2; else=NA") # incredibly this creates an error
e <- recode(b, "1:5=1; 6:10=2; else=NA")
e
[1] 1 1 1 1 1 2 2 2 2 2 NA NA NA NA NA NA NA NA NA NA
f <- recode(b, "lo:5=1; 6:10=2; 11:15=3; 16:hi=4; else=NA")
f
[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3 4 4 4 4 4
e <- recode(h, "NA=99")
e
[1] 1 2 3 4 5 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
# Reconding on multiple variables
i <- expand.grid(1:4, 1:2)
i
interaction(i$Var1, i$Var2) # creates all possible combinations
[1] 1.1 2.1 3.1 4.1 1.2 2.2 3.2 4.2
Levels: 1.1 2.1 3.1 4.1 1.2 2.2 3.2 4.2
set.seed(1)
n <- 30
mydf <- data.frame(x1 = rbinom(n, 1, 0.5), x2 = rbinom(n, 1, 0.1), x3 = rbinom(n,
1, 0.5), x4 = rbinom(n, 1, 0.8), x5 = 1, x6 = sample(c(0, 1, NA), n, TRUE))
str(mydf)
'data.frame': 30 obs. of 6 variables:
$ x1: int 0 0 1 1 0 1 1 1 1 0 ...
$ x2: int 0 0 0 0 0 0 0 0 0 0 ...
$ x3: int 1 0 0 0 1 0 0 1 0 1 ...
$ x4: int 1 1 1 0 1 1 1 1 0 1 ...
$ x5: num 1 1 1 1 1 1 1 1 1 1 ...
$ x6: num NA 1 1 0 NA 1 1 0 0 1 ...
mydf$x1 + mydf$x2 - mydf$x3 # vector operations
[1] -1 0 1 1 -1 1 1 0 1 -1 0 -1 1 0 1 -1 0 1 -1 0 1 -1 1 0 -1 0 -1 0 1
[30] 0
with(mydf, x1+x2-x3)
[1] -1 0 1 1 -1 1 1 0 1 -1 0 -1 1 0 1 -1 0 1 -1 0 1 -1 1 0 -1 0 -1 0 1
[30] 0
rowSums(mydf)
[1] NA 3 4 2 NA 4 4 4 2 4 3 3 3 2 NA 4 5 4 NA 5 NA 4 3 2 NA 3 3 NA 3
[30] NA
rowSums(mydf, na.rm = T)
[1] 3 3 4 2 3 4 4 4 2 4 3 3 3 2 3 4 5 4 2 5 2 4 3 2 3 3 3 2 3 2
data.frame(1:n, rowSums(mydf, na.rm = T))
rowMeans(mydf)
[1] NA 0.5000 0.6667 0.3333 NA 0.6667 0.6667 0.6667 0.3333 0.6667 0.5000 0.5000
[13] 0.5000 0.3333 NA 0.6667 0.8333 0.6667 NA 0.8333 NA 0.6667 0.5000 0.3333
[25] NA 0.5000 0.5000 NA 0.5000 NA
apply(mydf, 1, var) # 2nd argument: 1 for rows, 2 for columns, c(1, 2) rows & columns.
[1] NA 0.3000 0.2667 0.2667 NA 0.2667 0.2667 0.2667 0.2667 0.2667 0.3000 0.3000
[13] 0.3000 0.2667 NA 0.2667 0.1667 0.2667 NA 0.1667 NA 0.2667 0.3000 0.2667
[25] NA 0.3000 0.3000 NA 0.3000 NA
apply(mydf, 2, var)
x1 x2 x3 x4 x5 x6
0.2575 0.0000 0.2483 0.1437 0.0000 NA
sapply(mydf, var) # over list or vector
x1 x2 x3 x4 x5 x6
0.2575 0.0000 0.2483 0.1437 0.0000 NA
newvar
[1] 3 2 0 0 3 0 0 1 0 3 2 3 0 1 0 3 1 0 2 1 0 3 0 2 3 2 3 2 0 2
newvar[mydf$x1 == 1] <- with(mydf, x2 + x3)
number of items to replace is not a multiple of replacement length
# Matrices
set.seed(1)
a <- rnorm(100)
quantile(a, c(0.025, 0.975))
2.5% 97.5%
-1.671 1.797
quantile(a, seq(0, 1, by = 0.1))
0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
-2.2147 -1.0527 -0.6139 -0.3753 -0.0767 0.1139 0.3771 0.5812 0.7713 1.1811 2.4016
summary(as.logical(rbinom(1000, 1, 0.5)))
Mode FALSE TRUE
logical 492 508
summary(factor(a))
-2.2146998871775 -1.98935169586337 -1.80495862889104 -1.52356680042976
1 1 1 1
-1.47075238389927 -1.37705955682861 -1.27659220845804 -1.2536334002391
1 1 1 1
-1.22461261489836 -1.12936309608079 -1.04413462631653 -0.934097631644252
1 1 1 1
-0.835628612410047 -0.820468384118015 -0.743273208882405 -0.709946430921815
1 1 1 1
-0.70749515696212 -0.68875569454952 -0.626453810742332 -0.621240580541804
1 1 1 1
-0.612026393250771 -0.589520946188072 -0.573265414236886 -0.568668732818502
1 1 1 1
-0.54252003099165 -0.47815005510862 -0.473400636439312 -0.443291873218433
1 1 1 1
-0.41499456329968 -0.394289953710349 -0.367221476466509 -0.305388387156356
1 1 1 1
-0.304183923634301 -0.253361680136508 -0.164523596253587 -0.155795506705329
1 1 1 1
-0.135178615123832 -0.135054603880824 -0.112346212150228 -0.102787727342996
1 1 1 1
-0.0593133967111857 -0.0561287395290008 -0.0538050405829051 -0.0449336090152309
1 1 1 1
-0.0392400027331692 -0.0161902630989461 0.00110535163162413 0.0280021587806661
1 1 1 1
0.0743413241516641 0.0745649833651906 0.153253338211898 0.183643324222082
1 1 1 1
0.188792299514343 0.267098790772231 0.291446235517463 0.329507771815361
1 1 1 1
0.332950371213518 0.341119691424425 0.36458196213683 0.370018809916288
1 1 1 1
0.387671611559369 0.389843236411431 0.398105880367068 0.417941560199702
1 1 1 1
0.475509528899663 0.487429052428485 0.556663198673657 0.558486425565304
1 1 1 1
0.569719627442413 0.575781351653492 0.593901321217509 0.593946187628422
1 1 1 1
0.610726353489055 0.61982574789471 0.689739362450777 0.696963375404737
1 1 1 1
0.700213649514998 0.738324705129217 0.763175748457544 0.768532924515416
1 1 1 1
0.782136300731067 0.821221195098089 0.881107726454215 0.918977371608218
1 1 1 1
0.943836210685299 1.06309983727636 1.10002537198388 1.12493091814311
1 1 1 1
1.16040261569495 1.1780869965732 1.20786780598317 1.35867955152904
1 1 1 1
1.43302370170104 1.46555486156289 1.51178116845085 1.58683345454085
1 1 1 1
1.59528080213779 1.98039989850586 2.17261167036215 2.40161776050478
1 1 1 1
# Tables
set.seed(1)
a <- sample(1:5, 25, T)
a
[1] 2 2 3 5 2 5 5 4 4 1 2 1 4 2 4 3 4 5 2 4 5 2 4 1 2
table(a)
a
1 2 3 4 5
3 8 2 7 5
prop.table(table(a)) # to obtain percentages
a
1 2 3 4 5
0.12 0.32 0.08 0.28 0.20
prop.table(table(a)) *100
a
1 2 3 4 5
12 32 8 28 20
cbind(table(a), prop.table(table(a))*100)
[,1] [,2]
1 3 12
2 8 32
3 2 8
4 7 28
5 5 20
# multi-variate
b <- rep(c(1, 2), length = 25)
table(a, b)
b
a 1 2
1 0 3
2 5 3
3 1 1
4 5 2
5 2 3
c <- rep(c(3, 4, 5), length = 25)
table(a, b, c)
, , c = 3
b
a 1 2
1 0 1
2 3 1
3 0 1
4 1 0
5 1 1
, , c = 4
b
a 1 2
1 0 0
2 2 2
3 0 0
4 2 2
5 0 0
, , c = 5
b
a 1 2
1 0 2
2 0 0
3 1 0
4 2 0
5 1 2
ftable(a, c, c)
c 3 4 5
a c
1 3 1 0 0
4 0 0 0
5 0 0 2
2 3 4 0 0
4 0 4 0
5 0 0 0
3 3 1 0 0
4 0 0 0
5 0 0 1
4 3 1 0 0
4 0 4 0
5 0 0 2
5 3 2 0 0
4 0 0 0
5 0 0 3
xtabs(~a + b)
b
a 1 2
1 0 3
2 5 3
3 1 1
4 5 2
5 2 3
x <- table(a, b)
addmargins(x)
b
a 1 2 Sum
1 0 3 3
2 5 3 8
3 1 1 2
4 5 2 7
5 2 3 5
Sum 13 12 25
prop.table(table(a, b), 1) # proportions by rows
b
a 1 2
1 0.0000000 1.0000000
2 0.6250000 0.3750000
3 0.5000000 0.5000000
4 0.7142857 0.2857143
5 0.4000000 0.6000000
prop.table(table(a, b), 2)
b
a 1 2
1 0.00000000 0.25000000
2 0.38461538 0.25000000
3 0.07692308 0.08333333
4 0.38461538 0.16666667
5 0.15384615 0.25000000
# Correlations
set.seed(1)
n <- 1000
x1 <- rnorm(n, -1, 10)
x2 <- rnorm(n, 3, 2)
y <- 5 * x1 + x2 + rnorm(n, 1, 2)
cor(x1, x2)
[1] 0.006401211
cor.test(x1, x2)
Pearson's product-moment correlation
data: x1 and x2
t = 0.20223, df = 998, p-value = 0.8398
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.05561394 0.06836716
sample estimates:
cor
0.006401211
cor(cbind(x1, x2, y))
x1 x2 y
x1 1.000000000 0.006401211 0.99837564
x2 0.006401211 1.000000000 0.04731214
y 0.998375645 0.047312138 1.00000000
a <- rnorm(n)
b <- a^2 + rnorm(n)
plot(b~a)

plot(b ~ a, col = "gray")
curve((x), col = "red", add = TRUE)
curve((x^2), col = "blue", add = TRUE)

cor(a, b)
[1] -0.01712362
cor(a^2, b)
[1] 0.8430192
plot(b~I(a^2), col = "orange")
abline(lm(b~I(a^2)), col = "red")

layout(matrix(1:2, nrow = 1))
plot(b ~ a, col = "gray")
curve((x^2), col = "blue", add = TRUE)
plot(b ~ I(a^2), col = "gray")
curve((x), col = "blue", add = TRUE)

# Rounding
height <- c(167, 164, 172, 158, 181, 179)
mean(height)
[1] 170.1667
signif(mean(height), 4)
[1] 170.2
round(mean(height), 1)
[1] 170.2
round(mean(height), -2)
[1] 200
options(digits = 5)
sd(height)
[1] 8.8863
options(digits = 2)
sd(height)
[1] 8.9
options(scipen = -10)
10000000
[1] 1e+07
# sprintf
sprintf("%.3f", pi)
[1] "3.142"
sprintf("%05.1f", pi)
[1] "003.1"
# Plots as data summary
set.seed(1)
a <- rnorm(30)
hist(a, col = "gray20", border = "lightgray")

density(a)
Call:
density.default(x = a)
Data: a (30 obs.); Bandwidth 'bw' = 0.3891
x y
Min. :-3.4e+00 Min. :0.0e+00
1st Qu.:-1.8e+00 1st Qu.:3.0e-02
Median :-3.0e-01 Median :9.0e-02
Mean :-3.0e-01 Mean :1.6e-01
3rd Qu.: 1.2e+00 3rd Qu.:2.9e-01
Max. : 2.8e+00 Max. :4.6e-01
plot(density(a))

hist(a, freq = FALSE, col = "gray20", border = "lightgray")
lines(density(a), col = "red", lwd = 2)

b <- c(3, 4.5, 5, 8, 3, 6)
barplot(b, names.arg = letters[1:length(b)], horiz = F)

d <- rbind(c(2, 4, 1), c(6, 1, 3))
d
[,1] [,2] [,3]
[1,] 2e+00 4e+00 1e+00
[2,] 6e+00 1e+00 3e+00
barplot(d, names.arg = letters[1:3])

barplot(d, beside = T)

layout(matrix(1:2, nrow = 1))
barplot(b, names.arg = letters[1:6], horiz = TRUE, las = 2)
dotchart(b, labels = letters[1:6], xlim = c(0, 8))

boxplot(a)

e <- rnorm(100, 1, 1)
f <- rnorm(100, 2, 4)
boxplot(e, f)

g1 <- c(e, f)
g2 <- rep(c(1, 2), each = 100)
boxplot(g1 ~ g2)

# Scatterplot
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- x1 + x2
x4 <- x1 + x3
plot(x1, x2)

plot(x2~x1)

layout(matrix(1:3, nrow = 1))
plot(x1, x2)
plot(x1, x3)
plot(x1, x4)

pairs(~x1 + x2 + x3 + x4)

colors()[1:10]
[1] "white" "aliceblue" "antiquewhite" "antiquewhite1" "antiquewhite2"
[6] "antiquewhite3" "antiquewhite4" "aquamarine" "aquamarine1" "aquamarine2"
length(colors())
[1] 657
colors()[600]
[1] "slategray1"
set.seed(100)
z <- sample(1:4, 100, TRUE)
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, pch = 15, col = c("red", "blue"))

c("red", "blue", "green", "orange")[z]
[1] "blue" "blue" "green" "red" "blue" "blue" "orange" "blue" "green"
[10] "red" "green" "orange" "blue" "blue" "orange" "green" "red" "blue"
[19] "blue" "green" "green" "green" "green" "green" "blue" "red" "orange"
[28] "orange" "green" "blue" "blue" "orange" "blue" "orange" "green" "orange"
[37] "red" "green" "orange" "red" "blue" "orange" "orange" "orange" "green"
[46] "blue" "orange" "orange" "red" "blue" "blue" "red" "red" "blue"
[55] "green" "blue" "red" "red" "green" "red" "blue" "green" "orange"
[64] "green" "blue" "blue" "blue" "blue" "red" "green" "blue" "blue"
[73] "green" "orange" "green" "green" "orange" "orange" "orange" "red" "blue"
[82] "green" "orange" "orange" "red" "green" "green" "red" "blue" "green"
[91] "orange" "red" "blue" "blue" "orange" "blue" "green" "red" "red"
[100] "orange"
plot(x, y, pch = 15, col = c("red", "blue", "green", "orange")[z]) #indexing colors on z groups

# Analysis of variance (ANOVA)
set.seed(100)
tr <- rep(1:4, each = 30)
y <- numeric(length = 120)
y[tr == 1] <- rnorm(30, 5, 1)
y[tr == 2] <- rnorm(30, 4, 2)
y[tr == 3] <- rnorm(30, 4, 5)
y[tr == 4] <- rnorm(30, 1, 2)
aov(y~tr)
Call:
aov(formula = y ~ tr)
Terms:
tr Residuals
Sum of Squares 2.5e+02 1.2e+03
Deg. of Freedom 1.0e+00 1.2e+02
Residual standard error: 3.2e+00
Estimated effects may be unbalanced
summary(aov(y ~ factor(tr)))
Df Sum Sq Mean Sq F value Pr(>F)
factor(tr) 3.00e+00 2.97e+02 9.9e+01 9.98e+00 6.6e-06 ***
Residuals 1.16e+02 1.15e+03 9.9e+00
---
Signif. codes: 0e+00 ‘***’ 1e-03 ‘**’ 1e-02 ‘*’ 5e-02 ‘.’ 1e-01 ‘ ’ 1e+00
oneway.test(y ~ tr)
One-way analysis of means (not assuming equal variances)
data: y and tr
F = 4e+01, num df = 3e+00, denom df = 5e+01, p-value = 1e-13
oneway.test(y ~ factor(tr), var.equal = TRUE)
One-way analysis of means
data: y and factor(tr)
F = 1e+01, num df = 3e+00, denom df = 1e+02, p-value = 7e-06
by(y, tr, FUN = mean)
tr: 1
[1] 5e+00
------------------------------------------------------------------------------------
tr: 2
[1] 4.2e+00
------------------------------------------------------------------------------------
tr: 3
[1] 3.8e+00
------------------------------------------------------------------------------------
tr: 4
[1] 8.5e-01
tapply(y, tr, FUN = mean) # same thing
1 2 3 4
5.0e+00 4.2e+00 3.8e+00 8.5e-01
# Distributions
options(scipen = F)
options(digits = 5)
dnorm(0) # density
[1] 0.39894
dnorm(0, mean=-1)
[1] 0.24197
pnorm(0) # cumulative
[1] 0.5
pnorm(1.65) # 95% normal confidence interval
[1] 0.95053
pnorm(1.96)
[1] 0.975
pnorm(1.96) - pnorm(-1.96)
[1] 0.95
qnorm(c(0.025, 0.975)) # quantile
[1] -1.96 1.96
pnorm(qnorm(c(0.025, 0.975)))
[1] 0.025 0.975
# other distribution
dbinom(0, 1, 0.5)
[1] 0.5
pbinom(0, 1, 0.5)
[1] 0.5
qbinom(.95, 1, 0.5)
[1] 1
# Formulae
myformula <- ~x
class(myformula)
[1] "formula"
# interactions
y ~ x1 * x2
y ~ x1 * x2
# As strings
("y ~ x") == (y ~ x)
[1] TRUE
as.formula("y~x")
y ~ x
as.character(y ~ x)
[1] "~" "y" "x"
terms(y ~ x1 + x2)
y ~ x1 + x2
attr(,"variables")
list(y, x1, x2)
attr(,"factors")
x1 x2
y 0 0
x1 1 0
x2 0 1
attr(,"term.labels")
[1] "x1" "x2"
attr(,"order")
[1] 1 1
attr(,"intercept")
[1] 1
attr(,"response")
[1] 1
attr(,".Environment")
<environment: R_GlobalEnv>
update(y ~ x, ~. + x2)
y ~ x + x2
update(y ~ x, z ~ .)
z ~ x
# Bivariate Regression
set.seed(1)
bin <- rbinom(1000, 1, 0.5)
out <- 2 * bin + rnorm(1000)
by(out, bin, mean)
bin: 0
[1] -0.015881
---------------------------------------------------------------------
bin: 1
[1] 1.9662
t.test(out ~ bin)
Welch Two Sample t-test
data: out by bin
t = -30.3, df = 993, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-2.1105 -1.8537
sample estimates:
mean in group 0 mean in group 1
-0.015881 1.966237
lm(out ~ bin)
Call:
lm(formula = out ~ bin)
Coefficients:
(Intercept) bin
-0.0159 1.9821
summary(lm(out~bin))$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.015881 0.045341 -0.35027 7.2621e-01
bin 1.982119 0.065444 30.28724 1.9487e-143
plot(out ~ bin, col = "gray")
points(0:1, by(out, bin, mean), col = "blue", bg = "blue", pch = 23)
abline(coef(lm(out ~ bin)), col = "blue")

set.seed(1)
x <- runif(1000, 0, 10)
y <- 3 * x + rnorm(1000, 0, 5)
# glm plots
set.seed(1)
n <- 100
x <- runif(n, 0, 1)
y <- rbinom(n, 1, x)
plot(y ~ x, col = NULL, bg = rgb(0, 0, 0, 0.5), pch = 21) # bg: background color
abline(lm(y ~ x), lwd = 2) # lwd: line width (default: 1)

m1 <- glm(y ~ x, family = binomial(link = "logit"))
newdf <- data.frame(x = seq(0, 1, length.out = 100))
newdf
newdf$pout_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$fit
newdf[95:100,]
# build confidence intervals from standard error
newdf$pse_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$se.fit
newdf$plower_logit <- newdf$pout_logit - (1.96 * newdf$pse_logit) # 95% CI lower bound
newdf$pupper_logit <- newdf$pout_logit + (1.96 * newdf$pse_logit) # 95% CI upper bound
# qnorm(c(0.025, 0.975)) = (-1.96, +1.96)
newdf[,c(1,2,3,5,4)]
# now plot predicted values
with(newdf, plot(pout_logit ~ x, type = "l", lwd = 2))
with(newdf, lines(pupper_logit ~ x, type = "l", lty = 2))
with(newdf, lines(plower_logit ~ x, type = "l", lty = 2))


head(mpg)
#g <- ggplot(data = mpg, aes(x = displ, y = hwy))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(method = "lm") # with regression

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(size = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(shape = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(alpha = class))

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cyl)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ .)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ cyl)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_wrap( ~ class)

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth()

ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE) # Turn off confidence band

---
title: "R Notebook"
output:
  html_document: default
  html_notebook: default
---

```{r}
A <- 1:5
B <- 11:15
names(A) <- A
names(B) <- B
```

```{r}
A
B
```

```{r}
View(anscombe)
lm(y3~x3, data = anscombe)
```

```{r}
##-- now some "magic" to do the 4 regressions in a loop:
ff <- y ~ x
mods <- setNames(as.list(1:4), paste0("lm", 1:4))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  ## or   ff[[2]] <- as.name(paste0("y", i))
  ##      ff[[3]] <- as.name(paste0("x", i))
  mods[[i]] <- lmi <- lm(ff, data = anscombe)
  print(anova(lmi))
}
```

```{r}
## See how close they are (numerically!)
sapply(mods, coef)
lapply(mods, function(fm) coef(summary(fm)))

```

```{r}
## Now, do what you should have done in the first place: PLOTS
op <- par(mfrow = c(2, 2), mar = 0.1+c(4,4,1,1), oma =  c(0, 0, 2, 0))
for(i in 1:4) {
  ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
  plot(ff, data = anscombe, col = "red", pch = 21, bg = "orange", cex = 1.2,
       xlim = c(3, 19), ylim = c(3, 13))
  abline(mods[[i]], col = "blue")
}
mtext("Anscombe's 4 Regression data sets", outer = TRUE, cex = 1.5)
par(op)
```

```{r}
x = c(TRUE, FALSE, FALSE)
typeof(x)  #logical (vector)
mode(x)
storage.mode(x)
y = 1:10
typeof(y)
mode(y)
storage.mode(y)
```

```{r}
dim(y)
length(y)
```

```{r}
z= list(1, TRUE, 'safs') #trying to get a list
typeof(z)
class(z)
z[3]
length(z)
dim(z)

```

```{r}
quote(x+y)
as.list(quote(x + y))
```

```{r}
1e3L #create constant
1L
```

```{r}
cat(1+2)
'+'(1, 2)
```

```{r}
x <- options()
x$prompt
```

```{r}
match(NA, NaN)
match(NA, NA)
match(NaN, NaN)
```

```{r}
x = array(1:8, c(2,4))
x
i=2
j=3
x[i]
x[i, j]
x[[i]]
x[[i, j]]
```

```{r}
rownames(x)=c("a","b")
x
x = as.data.frame(x)
x
x["a",]
x[]
```

```{r}
i <- matrix(1:4, 2, byrow = TRUE)
i
```

```{r}
i[2,]
i[2, ,drop=FALSE] # keeping dimension 1 * n when selection a row
dim(i[2,])
dim(i[2, ,drop=FALSE])
```

```{r}
# https://cran.r-project.org/doc/manuals/R-intro.pdf

help.start()
```

```{r}
x <- rnorm(10000)
y <- rnorm(x)
plot(x, y)
hist(y)
```

```{r}
ls()
```

```{r}
rm(list=ls())
ls()
```

```{r}
x <- 1:20
w <- 1 + sqrt(x)/2
dummy <- data.frame(x=x, y= x + rnorm(x)*w)
dummy 
```

```{r}
# 4 Ordered and unordered factors
state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa",
"qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas",
"sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa",
"sa", "act", "nsw", "vic", "vic", "act")
statef <- factor(state)
statef
```

```{r}
levels(statef)
```

```{r}
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56,
61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46,
59, 46, 58, 43)
incmeans <- tapply(incomes, statef, mean)
incmeans
```

```{r}
# Arrays
a <- array(1:30, dim=c(2, 5,3))
a
```

```{r}
x <- array(1:20, dim=c(4,5)) # Generate a 4 by 5 array.
x
i <- array(c(1:3,3:1), dim=c(3,2))
i
x[i] #get the 3 elements shown by i: (3, 1), (2, 2) and (1, 3)
```

```{r}
help("<-")
```

```{r}
# Back to http://zoonek2.free.fr/UNIX/48_R/02.html
x <- rnorm(10)
x
sort(x)
order(x)
x[order(x)]
```

```{r}
x <- sample(1:5, 10, replace=T)
x
x[order(x)]
unique(x)
```

```{r}
seq(0,10, length=11) == seq(0,10, by=1)
```

```{r}
# Rep
rep(0, 10)
rep(1:5,3)
rep(1:5, each=3)
rep(1:5,2,each=3)
```

```{r}
# Factor
x <- factor( sample(c("Yes", "No", "Perhaps"), 5, replace=T) )
x
```

```{r}
# specify levels
l <- c("Yes", "No", "Perhaps")
x <- factor( sample(l, 5, replace=T), levels=l )
x
```

```{r}
table(x)
```

```{r}
# gl: Generate Factor Levels
gl(1,4)
gl(2,4)
gl(2,1,8)
gl(2,1,8, labels=c(T,F))
```

```{r}
x <- gl(2,4)
x
y <- gl(2,1,8)
y
interaction(x,y)
data.frame(x,y, int=interaction(x,y))
```

```{r}
# Cartesian product (toutes les possibilites)
x <- c("A", "B", "C")
y <- 1:2
z <- c("a", "b")
expand.grid(x,y,z)
```

```{r}
x <- factor(c(3,4,5,1))
as.numeric( levels(x))
as.numeric( levels(x)[ x ] ) # proper way to convert from factor to numeric
```

```{r}
# Data Frames
n <- 10
df <- data.frame( x=rnorm(n), y=sample(c(T,F),n,replace=T) )
str(df)
```

```{r}
summary(df)
```

```{r}
names(df);cat(rownames(df))
```

```{r}
# Merge
merge(x, y)                 # INNER JOIN
merge(x, y, all.x = TRUE)   # LEFT JOIN
merge(x, y, all.y = TRUE)   # RIGHT JOIN
merge(x, y, all   = TRUE)   # OUTER JOIN
#merge(a, b, by=c("y", "z")) # specify what to merge on
```

```{r}
# Regression
data(cars) 
#View(cars)

# Regression
lm.fit=lm( dist ~ speed, data=cars, na.action = na.exclude)
lm.fit
# Polynomial regression
lm.fit3 = lm( dist ~ poly(speed,3), data=cars)

#plot(lm.fit)
plot(cars$speed, cars$dist)
abline(lm.fit)
```

```{r}
# Lists
h <- list()
h[["foo"]] <- 1
h[["bar"]] <- c("a", "b", "c")
h["bar"] == h[["bar"]] #h["bar"] is a list containing the vector
```

```{r}
# Delete
h[["bar"]] <- NULL
```

```{r}
m <- matrix( c(1,2,3,4), nrow=2 )
m
```

```{r}
solve(m)
x <- matrix( c(6,7), nrow=2 )
solve(m, x)
```

```{r}
?solve
```

```{r}
n <- 1000
x1 <- factor( sample(1:3, n, replace=T), levels=1:3 )
x2 <- factor( sample(LETTERS[1:5], n, replace=T), levels=LETTERS[1:5] )
x3 <- factor( sample(c(F,T),n,replace=T), levels=c(F,T) )
d <- data.frame(x1,x2,x3)
r <- table(d)
```

```{r}
r
```

```{r}
ftable(d) #easier reading
```

```{r}
# contingency table into a data.frame
n <- 100
k <- 10
x <- factor( sample(LETTERS[1:k], n, replace=T), levels=LETTERS[1:k] )
x
d <- table(x)
x2 = factor( rep(names(d),d), levels=names(d) )
x2
```

```{r}
# apply
options(digits=4)
df <- data.frame(x=rnorm(20),y=rnorm(20),z=rnorm(20))
apply(df,2,mean)
rownames(df) <- LETTERS[1:20]
apply(df, 1, mean)
```

```{r}
gl(2,10,20)
tapply(1:20, gl(2,10,20), sum) # tapply: 2nd argument used for grouping

by(1:20, gl(2,10,20), sum)
```

```{r}
x <- list(a=rnorm(10), b=runif(100), c=rgamma(50,1))
sapply(x,sd) # sapply: apply FUN on each element of vector
lapply(x,sd) # lapply: same but returns list

```

```{r}
# Exercise: Let x be a boolean vector. Count the number of sequences ("runs") of zeros (for instance, in 00101001010110, there are 6 runs: 00 0 00 0 0 0). Count the number of sequences of 1. Counth the total number of sequences. Same question for a factor with more tham two levels.
n <- 50
x <- sample(0:1, n, replace=T, p=c(.2,.8))
x
diff(x, lag=1)


```

```{r}
#Let r be the return of a financial asset. The clustered return is the accumulated return for a sequence of returns of the same sign. The trend number is the number of steps in such a sequence. The average return is their ratio. Compute these quantities.
data(EuStockMarkets)
x <- EuStockMarkets
# We aren't interested in the spot prices, but in the returns
# return[i] = ( price[i] - price[i-1] ) / price[i-1]
y <- apply(x, 2, function (x) { diff(x)/x[-length(x)] })
# We normalize the data
z <- apply(y, 2, function (x) { (x-mean(x))/sd(x) })
# A single time series
r <- z[,1]
# The runs
f <- factor(cumsum(abs(diff(sign(r))))/2)
r <- r[-1]
accumulated.return <- tapply(r, f, sum)
trend.number <- table(f)
boxplot(abs(accumulated.return) ~ trend.number, col='pink',
        main="Accumulated return")
```

```{r}
# Strings
print("Hello\n")

cat("Hello\n") #use cat
```

```{r}
paste("Hello", "World", "!", sep="") #concatenate
paste("Hello ", " World", "!", sep="")
```

```{r}
x <- 5
paste("x=", x)
cat("x=", x, "\n", sep="\n")
```

```{r}
s <- c("Hello", " ", "World", "!")
paste(s)
paste(s, sep="")
paste(s, collapse="")
```

```{r}
paste(1:3, "Hello World!", sep=":")
```

```{r}
nchar("Hello World!")
```

```{r}
s <- "Hello World"
substring(s, 4, 6)
```

```{r}
s <- "foo-->bar-->baz"
strsplit(s, "-->")
```

```{r}
# Regex
s <- "foo, bar, baz"
strsplit(s, ", *")
```

```{r}
s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")
s
```

```{r}
grep("O", s)
grep("O", s, value=T)
```

```{r}
regexpr("o", "Hello")
```

```{r}
regexpr("o", c("Hello", "World!"))
```

```{r}
s <- "foo    bar baz"
gsub(" ", "", s)   # Remove all the spaces
gsub(" +", " ", s)  # Remove multiple spaces and replace them by single spaces

```

```{r}
#The "sub" is similar to "gsub" but only replaces the first occurrence.
s <- "foo bar baz"
sub(" ", "", s)
```

```{r}
# Dates
as.Date("2005-05-15") #ISO 8601
```

```{r}
as.Date("15/05/2005", format="%d/%m/%Y")
as.Date("15/05/05", format="%d/%m/%y")
cat("\n")
as.Date("01/02/03", format="%y/%m/%d")
as.Date("01/02/03", format="%y/%d/%m")

```

```{r}
as.Date("01/02/03", format="%y/%m/%d") - as.Date("01/02/03", format="%y/%d/%m")
```

```{r}
Sys.Date()
format(Sys.Date(), format="%A, %d %B %Y")
```

```{r}
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by="month")
seq(as.Date("2005-01-01"), as.Date("2005-07-01"), by=31)
```

```{r}
methods(class="Date")
```

```{r}
as.POSIXlt("2005-05-15 21:45:17")
```

```{r}
as.POSIXct(Sys.Date())
```

```{r}
# Reading from dataframes
# option 1
  #d <- read.table("foo.txt")
  #d$Date <- as.Date( as.character( d$Date ) )
# option 2
  #read.table("foo.txt", colClasses=c("Date", "character", rep(10, "numeric")))
```

```{r}

```

```{r}
options(warn=1)
```

```{r}
methods(plot)
```

```{r}
# Import 

# d <- read.table("foo.txt", header=T, sep=",")
# d <- read.csv("txt.csv")
# d <- read.csv2("txt.csv")  # semicolon-separated file, with a
#                            # comma instead of the decimal point.
# d <- read.delim("foo.txt") # Tab-delimited file
# d <- read.fwf("txt.fwf")   # Fixed width fields
```

```{r}
# Excel: this may be trickier: the missing values often appear as "#N/A!" and are mistaken for the start of a comment... You can try

# d <- read.table("foo.csv", header = TRUE, sep = ",", 
#                 na.strings = c("#N/A!", "NA", "@NA"), 
#                 quote = '"',
#                 comment.char = "")
```

```{r}
#If your file only contains number, or only strings, it is wiser to store it in a matrix, not a data.frame. This is what the "scan" function does.
# A numeric matrix
  # x <- scan("foo.txt", sep=",")  # Gives a numeric vector
  # n <- scan("foo.txt", sep=",", nlines=1)
  # x <- matrix(x, nc=n)

# A vector of strings
  #x <- scan("foo.txt", what=character(0))

```

```{r}
# Back tohttps://cran.r-project.org/doc/manuals/R-intro.pdf - Regression

  # fm05 <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = production)
  # fm6 <- update(fm05, . ~ . + x6)
  # smf6 <- update(fm6, sqrt(.) ~ .)
```

```{r}
# Spine (from help)
require(graphics)

op <- par(mfrow = c(2,1), mgp = c(2,.8,0), mar = 0.1+c(3,3,3,1))
n <- 9
x <- 1:n
y <- rnorm(n)
plot(x, y, main = paste("spline[fun](.) through", n, "points"))
lines(spline(x, y))
lines(spline(x, y, n = 210), col = 2)
```

```{r}
# NA handling - http://thomasleeper.com/Rcourse/Tutorials/NA.html
g1 <- c(1, 2, NA, NA, NA, 6, 7)
g2 <- na.omit(g1)
g2
```

```{r}
attributes(g2)$na.action
```

```{r}
sum(g1)
sum(g1, na.rm = TRUE)
```

```{r}
# Cor -> can eliminate only pair-wise NAs ()
x <- c(1, 2, 3, NA, 5, 7, 9)
y <- c(3, 2, 4, 5, 1, 3, 4)
z <- c(NA, 2, 3, 5, 4, 3, 4)
m <- data.frame(x, y, z)
m
```

```{r}
cor(m)  # returns all NAs
```

```{r}
cor(m, use = "complete.obs") # Default, all records with NA removed
```

```{r}
cor(m, use = "pairwise.complete.obs") #kept more records for y~x and y~z
```

```{r}
# Defaut for lm is also casewise deletion
lm <- lm(y ~ x + z, data = m)
summary(lm)
```

```{r}
# Checking length of data used for regression
length(m$y)
length(lm$fitted)
```

```{r}
# checking where missing data is
is.na(m) # only useful for small examples
# image
image(is.na(m), main = "Missing Values", xlab = "Observation", ylab = "Variable", 
    xaxt = "n", yaxt = "n", bty = "n")
axis(1, seq(0, 1, length.out = nrow(m)), 1:nrow(m), col = "white")
axis(2, c(0, 0.5, 1), names(m), col = "white", las = 2)
```

```{r}
# to remove casewise:
m2 <- na.omit(m) # use new variable to keep original dataframe
m
m2
```

```{r}
# Mean imputation
x2 <- x
x2[is.na(x2)] <- mean(x2, na.rm = TRUE)
x
x2
```

```{r}
# Random imputation - conserve mean and variance. 
# How: sample rest of the values to fill NAs
x3 <- x
x3[is.na(x3)] <- sample(x3[!is.na(x3)], sum(is.na(x3)), TRUE)
x
x3
```

```{r}
# Saving R data http://thomasleeper.com/Rcourse/Tutorials/savingdata.html 
set.seed(1)
mydf <- data.frame(x = rnorm(10), y = rnorm(10), z = rnorm(10))
```

```{r}
save(mydf, file = "saveddf.RData") # can be loaded by double-clicking on saved file
```

```{r}
unlink("saveddf.RData") # removing file
```

```{r}
# dput to have a readable format (e.g. for stack overflow)
dput(mydf)
```

```{r}
dput(mydf, "saveddf.txt")
```

```{r}
mydf2 <- dget("saveddf.txt")
mydf2
```

```{r}
mydf==mydf2 # due to rounding
```

```{r}
unlink("saveddf.text")
```

```{r}
# csv
write.csv(mydf, file = "saveddf.csv")
unlink("savedf.csv")
```

```{r}
# Dataframe rearrangement

set.seed(50)
mydf <- data.frame(a = rep(1:2, each = 10), b = rep(1:4, times = 5), c = rnorm(20), 
    d = rnorm(20), e = sample(1:20, 20, FALSE))
head(mydf)
```

```{r}
# manual order change

head(mydf[, c("c", "d", "e", "a", "b")])
# mydf <- mydf[, c(3, 4, 5, 1, 2)]
```

```{r}
# using order
order(mydf$e)
head(mydf[order(mydf$e), ]) # ordering on any column
```

```{r}
# Subset

mydf[mydf$a == 1, ]
mydf[mydf$a == 1 & mydf$b > 2, ]

subset(mydf, a == 1 & b > 2)
subset(mydf, select = c("a", "b"))
subset(mydf, a == 1 & b > 2, select = c("c", "d")) # filter rows & columns

```

```{r}
# Splitting 

split(mydf, mydf$a) # -> splitting according to values of a
```

```{r}
split(mydf, list(mydf$a, mydf$b))
```

```{r}
lapply(split(mydf, mydf$a), summary) # perform summary on each of the dataframes
```

```{r}
# sampling: splitting into training and test set
# Option 1
s <- sample(1:nrow(mydf), 5, F) #no replacement
s
```

```{r}
# use 5 rows as training set
mydf[s,]
```

```{r}
# test set
mydf[-s, ]
```

```{r}
# Option 2
s2 <- rbinom(nrow(mydf), 1, 0.2) #won't necessarily give exactly 5 rows
s2
```

```{r}
mydf[s2,]
mydf[-s2,]
```

```{r}
# Recoding vectors
library(car)
```

```{r}
b <- 1:20
#h <- recode(b, "1:5=1: 6:10=2; else=NA") # incredibly this creates an error
e <- recode(b, "1:5=1; 6:10=2; else=NA")
e
f <- recode(b, "lo:5=1; 6:10=2; 11:15=3; 16:hi=4; else=NA")
f
```

```{r}
e <- recode(h, "NA=99")
e
```

```{r}
# Reconding on multiple variables
i <- expand.grid(1:4, 1:2)
i
```

```{r}
interaction(i$Var1, i$Var2) # creates all possible combinations
```

```{r}
# Scaling
set.seed(1)
n <- 30
mydf <- data.frame(x1 = rbinom(n, 1, 0.5), x2 = rbinom(n, 1, 0.1), x3 = rbinom(n, 
    1, 0.5), x4 = rbinom(n, 1, 0.8), x5 = 1, x6 = sample(c(0, 1, NA), n, TRUE))
```

```{r}
str(mydf)
```

```{r}
mydf$x1 + mydf$x2 - mydf$x3 # vector operations
with(mydf, x1+x2-x3) # with to indicate dataframe

```

```{r}
rowSums(mydf)
rowSums(mydf, na.rm = T)
data.frame(1:n, rowSums(mydf, na.rm = T))
```

```{r}
rowMeans(mydf)
```

```{r}
apply(mydf, 1, var) # 2nd argument: 1 for rows, 2 for columns, c(1, 2)  rows & columns.
apply(mydf, 2, var)
sapply(mydf, var) # over list or vector
```

```{r}
# adding a variable
newvar <- numeric(nrow(mydf))
newvar[mydf$x1 == 1] <- with(mydf[mydf$x1 == 1, ], x2 + x3)
newvar[mydf$x1 == 0] <- with(mydf[mydf$x1 == 0, ], x3 + x4 + x5)
newvar
```

```{r}
newvar[mydf$x1 == 1] <- with(mydf, x2 + x3) # here different lengths !
```

```{r}
# Matrices
set.seed(1)
a <- rnorm(100)
quantile(a, c(0.025, 0.975))
quantile(a, seq(0, 1, by = 0.1))
```

```{r}
summary(as.logical(rbinom(1000, 1, 0.5)))
```

```{r}
summary(factor(a)) # for factor, returns all value
```

```{r}
# Tables
set.seed(1)
a <- sample(1:5, 25, T)
a
```

```{r}
table(a)
```

```{r}
prop.table(table(a)) # to obtain percentages
prop.table(table(a)) *100
```

```{r}
cbind(table(a), prop.table(table(a))*100)
```

```{r}
# multi-variate
b <- rep(c(1, 2), length = 25)
table(a, b)
```

```{r}
c <- rep(c(3, 4, 5), length = 25)
table(a, b, c)
```

```{r}
ftable(a, c, c) # provides more compact format
```

```{r}
xtabs(~a + b) # right hand formulas same as table
```

```{r}
x <- table(a, b)
addmargins(x)
```

```{r}
prop.table(table(a, b), 1) # proportions by rows
prop.table(table(a, b), 2)
```

```{r}
# Correlations
set.seed(1)
n <- 1000
x1 <- rnorm(n, -1, 10)
x2 <- rnorm(n, 3, 2)
y <- 5 * x1 + x2 + rnorm(n, 1, 2)
```

```{r}
cor(x1, x2)
cor.test(x1, x2)
```

```{r}
cor(cbind(x1, x2, y)) # input is matrix for correlation matrix
```

```{r}
a <- rnorm(n)
b <- a^2 + rnorm(n)
plot(b~a)
```

```{r}
plot(b ~ a, col = "gray")
curve((x), col = "red", add = TRUE)
curve((x^2), col = "blue", add = TRUE)
```

```{r}
cor(a, b)
cor(a^2, b)
```

```{r}
plot(b~I(a^2), col = "orange")
abline(lm(b~I(a^2)), col = "red")
```

```{r}
layout(matrix(1:2, nrow = 1))
plot(b ~ a, col = "gray")
curve((x^2), col = "blue", add = TRUE)
plot(b ~ I(a^2), col = "gray")
curve((x), col = "blue", add = TRUE)
```

```{r}
# Rounding
height <- c(167, 164, 172, 158, 181, 179)
mean(height)
```

```{r}
signif(mean(height), 4)
round(mean(height), 1)
round(mean(height), -2)
```

```{r}
options(digits = 5)
sd(height)
options(digits = 2)
sd(height)
```

```{r}
options(scipen = -10) #positive value to get fixed notation
10000000
```

```{r}
# sprintf
sprintf("%.3f", pi)
sprintf("%05.1f", pi)
```

```{r}
# Plots as data summary
set.seed(1)
a <- rnorm(30)
hist(a, col = "gray20", border = "lightgray")
```

```{r}
density(a)
plot(density(a))
```

```{r}
hist(a, freq = FALSE, col = "gray20", border = "lightgray")
lines(density(a), col = "red", lwd = 2)
```

```{r}
b <- c(3, 4.5, 5, 8, 3, 6)
barplot(b, names.arg = letters[1:length(b)], horiz = F)
```

```{r}
d <- rbind(c(2, 4, 1), c(6, 1, 3))
d
barplot(d, names.arg = letters[1:3])
```

```{r}
barplot(d, beside = T)
```

```{r}
layout(matrix(1:2, nrow = 1))
barplot(b, names.arg = letters[1:6], horiz = TRUE, las = 2)
dotchart(b, labels = letters[1:6], xlim = c(0, 8))
```

```{r}
boxplot(a)
```

```{r}
e <- rnorm(100, 1, 1)
f <- rnorm(100, 2, 4)
boxplot(e, f)
```

```{r}
g1 <- c(e, f)
g2 <- rep(c(1, 2), each = 100)
boxplot(g1 ~ g2)
```

```{r}
# Scatterplot
x1 <- rnorm(1000)
x2 <- rnorm(1000)
x3 <- x1 + x2
x4 <- x1 + x3
```

```{r}
plot(x1, x2)
plot(x2~x1)
```

```{r}
layout(matrix(1:3, nrow = 1))
plot(x1, x2)
plot(x1, x3)
plot(x1, x4)
```

```{r}
pairs(~x1 + x2 + x3 + x4)
```

```{r}
colors()[1:10]
length(colors())
colors()[600]
```

```{r}
set.seed(100)
z <- sample(1:4, 100, TRUE)
x <- rnorm(100)
y <- rnorm(100)
plot(x, y, pch = 15, col = c("red", "blue"))
```

```{r}
c("red", "blue", "green", "orange")[z]
plot(x, y, pch = 15, col = c("red", "blue", "green", "orange")[z]) #indexing colors on z groups
```

```{r}
# Analysis of variance 

set.seed(100)
tr <- rep(1:4, each = 30)
y <- numeric(length = 120)
y[tr == 1] <- rnorm(30, 5, 1)
y[tr == 2] <- rnorm(30, 4, 2)
y[tr == 3] <- rnorm(30, 4, 5)
y[tr == 4] <- rnorm(30, 1, 2)
```

```{r}
aov(y~tr)
```

```{r}
summary(aov(y ~ factor(tr)))
```

```{r}
oneway.test(y ~ tr)
```

```{r}
oneway.test(y ~ factor(tr), var.equal = TRUE)
```

```{r}
by(y, tr, FUN = mean)
```

```{r}
tapply(y, tr, FUN = mean) # same thing
```

```{r}
# Distributions
options(scipen = F)
options(digits = 5)
dnorm(0)  # density
dnorm(0, mean=-1)
```

```{r}
pnorm(0)  # cumulative
pnorm(1.65) # 95% normal confidence interval
pnorm(1.96)
pnorm(1.96) - pnorm(-1.96)
```

```{r}
qnorm(c(0.025, 0.975))  # quantile
pnorm(qnorm(c(0.025, 0.975)))
```

```{r}
# other distribution
dbinom(0, 1, 0.5)
pbinom(0, 1, 0.5)
qbinom(.95, 1, 0.5) # because binomial discrete distribution
```

```{r}
# Formulae
myformula <- ~x
class(myformula)
```

```{r}
# interactions
y ~ x1 * x2
y ~ x1:x2 # without the variables themselves 
y ~ -1 + x1 * x2 # drop the intercept
y ~ x + I(x^2) # without I() R thinks x^2 is a duplicate
```

```{r}
# As strings
("y ~ x") == (y ~ x)
as.formula("y~x")
as.character(y ~ x) # Formula indexed with operand first
```

```{r}
terms(y ~ x1 + x2)
```

```{r}
update(y ~ x, ~. + x2)
update(y ~ x, z ~ .)
```

```{r}
# Bivariate Regression

set.seed(1)
bin <- rbinom(1000, 1, 0.5)

out <- 2 * bin + rnorm(1000)

by(out, bin, mean)
```

```{r}
t.test(out ~ bin)
```

```{r}
lm(out ~ bin) # slope = mean difference
```

```{r}
summary(lm(out~bin))$coef
```

```{r}
plot(out ~ bin, col = "gray")
points(0:1, by(out, bin, mean), col = "blue", bg = "blue", pch = 23)
abline(coef(lm(out ~ bin)), col = "blue")
```

```{r}
set.seed(1)
x <- runif(1000, 0, 10)
y <- 3 * x + rnorm(1000, 0, 5)
```

```{r}
# glm plots
set.seed(1)
n <- 100
x <- runif(n, 0, 1)
y <- rbinom(n, 1, x) # more outcomes of 1 as x -> 1
```

```{r}
plot(y ~ x, col = NULL, bg = rgb(0, 0, 0, 0.5), pch = 21) # bg: background color (for points)
abline(lm(y ~ x), lwd = 2) # lwd: line width (default: 1)
# here linear doesn't work
```

```{r}
m1 <- glm(y ~ x, family = binomial(link = "logit"))
```

```{r}
newdf <- data.frame(x = seq(0, 1, length.out = 100))
newdf
```

```{r}
newdf$pout_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$fit
newdf[95:100,]
```

```{r}
# build confidence intervals from standard error
newdf$pse_logit <- predict(m1, newdf, se.fit = TRUE, type = "response")$se.fit
newdf$plower_logit <- newdf$pout_logit - (1.96 * newdf$pse_logit)  # 95% CI lower bound
newdf$pupper_logit <- newdf$pout_logit + (1.96 * newdf$pse_logit)  # 95% CI upper bound
# qnorm(c(0.025, 0.975)) = (-1.96, +1.96)
```

```{r}
newdf[,c(1,2,3,5,4)]
```

```{r}
# now plot predicted values
with(newdf, plot(pout_logit ~ x, type = "l", lwd = 2))
with(newdf, lines(pupper_logit ~ x, type = "l", lty = 2))
with(newdf, lines(plower_logit ~ x, type = "l", lty = 2))
```

```{r}
library(ggplot2)

# basic R's plot
plot(iris$Sepal.Width, iris$Sepal.Length)
```

```{r}
head(mpg)
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point()
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(method = "lm") # with regression
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(color = class)) # adding a dimension
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(size = class))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(shape = class))
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point(aes(alpha = class))
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(. ~ cyl) # 2D grid, rows ~ cols
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ .) 
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_grid(drv ~ cyl)
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + facet_wrap( ~ class)
# facet_wrap wraps a 1d sequence of panels into 2d. This is generally a better use of screen space than facet_grid because most displays are roughly rectangular.
```

```{r}
ggplot(data = mpg, aes(x = displ, y = hwy)) +  geom_point() + geom_smooth()
ggplot(data = mpg, aes(x = displ, y = hwy)) + geom_point() + geom_smooth(se = FALSE)  # Turn off confidence band
```

```{r}

```

```{r}

```

